Download the maize B73 V4 data and sorghum genome file
wget ftp://ftp.ensemblgenomes.org/pub/plants/release-34/gff3/zea_mays/Zea_mays.AGPv4.34.gff3.gz
gunzip Zea_mays.AGPv4.34.gff3.gz
wget ftp://ftp.ensemblgenomes.org/pub/plants/release-34/fasta/zea_mays/dna/Zea_mays.AGPv4.dna.toplevel.fa.gz
gunzip Zea_mays.AGPv4.dna.toplevel.fa.gz
wget ftp://ftp.ensemblgenomes.org/pub/plants/release-49/fasta/sorghum_bicolor/dna/Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa.gz
gunzip Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa.gz
Using AnchorWave to extract full-length CDS.
NOTE: please do NOT use CDS extracted using other software and do NOT use the output full-length CDS file for other purpose. Since AnchorWave filtered some CDS records to minimum the impact of minimap2 limitation on genome alignment that “Minimap2 often misses small exons” (https://github.com/lh3/minimap2#limitations)
anchorwave gff2seq -r Zea_mays.AGPv4.dna.toplevel.fa -i Zea_mays.AGPv4.34.gff3 -o cds.fa
use minimap2 (https://github.com/lh3/minimap2) to map the extracted sequence to the reference genome sequence and synthesis genomes
minimap2 -x splice -t 11 -k 12 -a -p 0.4 -N 20 Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa cds.fa > cds.sam
minimap2 -x splice -t 11 -k 12 -a -p 0.4 -N 20 Zea_mays.AGPv4.dna.toplevel.fa cds.fa > ref.sam
prepare genome annotation files
all_reproducible_peaks_summits_merged.bed has been published at https://www.nature.com/articles/s41467-020-18832-8
wget https://github.com/mcstitzer/maize_TEs/raw/master/B73.structuralTEv2.disjoined.2018-09-19.gff3.gz
gunzip B73.structuralTEv2.disjoined.2018-09-19.gff3.gz
grep -v "#" B73.structuralTEv2.disjoined.2018-09-19.gff3 | awk '{print $1"\t"$4-1"\t"$5}' | bedtools sort | bedtools merge > B73.structuralTEv2.disjoined.2018-09-19.bed
grep "biotype=protein_coding" Zea_mays.AGPv4.34.gff3 | grep -e "\sgene\s" | awk '{print $1"\t"$4-1"\t"$5}' | bedtools merge > Zea_mays.AGPv4.34_coding_gene.bed
#total length of geneitc region
cat Zea_mays.AGPv4.34_coding_gene.bed | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}' # 162494287
grep "\sCDS\s" Zea_mays.AGPv4.34.gff3 | awk '{print $1"\t"$4-1"\t"$5}' | bedtools sort | bedtools merge > Zea_mays.AGPv4.34_CDS.bed
bedtools subtract -a Zea_mays.AGPv4.34_coding_gene.bed -b Zea_mays.AGPv4.34_CDS.bed | bedtools sort | bedtools merge > Zea_mays.AGPv4.34_coding_gene_nonCDS.bed
perform genome alignment using minimap2 and summarize the result
/usr/bin/time minimap2 -x asm5 -t 1 -a Zea_mays.AGPv4.dna.toplevel.fa Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa > minimap2_sorghum_5.sam 2> minimap2_asm5.log
/usr/bin/time minimap2 -x asm10 -t 1 -a Zea_mays.AGPv4.dna.toplevel.fa Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa > minimap2_sorghum_10.sam 2> minimap2_asm10.log
/usr/bin/time minimap2 -x asm20 -t 1 -a Zea_mays.AGPv4.dna.toplevel.fa Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa > minimap2_sorghum_20.sam 2> minimap2_asm20.log
samtools sort minimap2_sorghum_5.sam > minimap2_sorghum_5.bam
samtools sort minimap2_sorghum_10.sam > minimap2_sorghum_10.bam
samtools sort minimap2_sorghum_20.sam > minimap2_sorghum_20.bam
samtools depth minimap2_sorghum_5.bam | wc -l #2503314
samtools depth minimap2_sorghum_5.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 457652
samtools depth minimap2_sorghum_5.bam | awk '$3>0{print $0}' | wc -l #2489881
samtools depth minimap2_sorghum_5.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l #456029
samtools depth minimap2_sorghum_5.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 2332093
samtools depth minimap2_sorghum_5.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l # 2320162
samtools depth minimap2_sorghum_5.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 353993
samtools depth minimap2_sorghum_5.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 348042
samtools depth minimap2_sorghum_10.bam | wc -l # 18445054
samtools depth minimap2_sorghum_10.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 3401133
samtools depth minimap2_sorghum_10.bam | awk '$3>0{print $0}' | wc -l # 17913090
samtools depth minimap2_sorghum_10.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l #3347285
samtools depth minimap2_sorghum_10.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 17619682
samtools depth minimap2_sorghum_10.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l # 17118348
samtools depth minimap2_sorghum_10.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 2354836
samtools depth minimap2_sorghum_10.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 2259444
samtools depth minimap2_sorghum_20.bam | wc -l # 52224881
samtools depth minimap2_sorghum_20.bam -b all_reproducible_peaks_summits_merged.bed | wc -l #11113820
samtools depth minimap2_sorghum_20.bam | awk '$3>0{print $0}' | wc -l # 49024035
samtools depth minimap2_sorghum_20.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l # 10680401
samtools depth minimap2_sorghum_20.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 47118588
samtools depth minimap2_sorghum_20.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l # 44182530
samtools depth minimap2_sorghum_20.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 7728976
samtools depth minimap2_sorghum_20.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 7132545
perform genome alignment using LAST(http://last.cbrc.jp/) and summarize the result
/usr/bin/time sh ./lastalPipeline.sh >last.log 2>&1
python2 maf-convert sam sorghum_lastal.maf > sorghum_lastal.sam
sed -i 's/[0-9]\+H//g' sorghum_lastal.sam
samtools view -O BAM --reference Zea_mays.AGPv4.dna.toplevel.fa sorghum_lastal.sam | samtools sort - > sorghum_lastal.bam; samtools index sorghum_lastal.bam # this is the many-to-many alignment
samtools depth sorghum_lastal.bam | wc -l # 597884081
samtools depth sorghum_lastal.bam | awk '$3>0{print $0}' | wc -l # 594220046
samtools depth sorghum_lastal.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 33554729
samtools depth sorghum_lastal.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l # 32598984
samtools depth sorghum_lastal.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 118438260
samtools depth sorghum_lastal.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l # 116732768
samtools depth sorghum_lastal.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 365399859
samtools depth sorghum_lastal.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 364395540
python2 maf-convert sam sorghum_lastal_final.maf > sorghum_lastal_final.sam # this many to one
sed -i 's/query.//g' sorghum_lastal_final.sam
sed -i 's/col.//g' sorghum_lastal_final.sam
sed -i 's/[0-9]\+H//g' sorghum_lastal_final.sam
samtools view -O BAM --reference Zea_mays.AGPv4.dna.toplevel.fa sorghum_lastal_final.sam | samtools sort - > sorghum_lastal_final.bam; samtools index sorghum_lastal_final.bam
samtools depth sorghum_lastal_final.bam | wc -l # 547513366
samtools depth sorghum_lastal_final.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 32593361
samtools depth sorghum_lastal_final.bam | awk '$3>0{print $0}' | wc -l # 541455727
samtools depth sorghum_lastal_final.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l # 31471791
samtools depth sorghum_lastal_final.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 113229899
samtools depth sorghum_lastal_final.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l # 110793737
samtools depth sorghum_lastal_final.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 337208689
samtools depth sorghum_lastal_final.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 335012548
python2 maf-convert sam sorghum_lastal_final_split_swap_split_Comparator.maf > sorghum_lastal_final_split_swap_split_Comparator.sam # one-to-one
sed -i 's/query.//g' sorghum_lastal_final_split_swap_split_Comparator.sam
sed -i 's/col.//g' sorghum_lastal_final_split_swap_split_Comparator.sam
sed -i 's/[0-9]\+H//g' sorghum_lastal_final_split_swap_split_Comparator.sam
samtools view -O BAM --reference Zea_mays.AGPv4.dna.toplevel.fa sorghum_lastal_final_split_swap_split_Comparator.sam | samtools sort - > sorghum_lastal_final_split_swap_split_Comparator.bam; samtools index sorghum_lastal_final_split_swap_split_Comparator.bam
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam | wc -l # 129898533
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 23170062
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam | awk '$3>0{print $0}' | wc -l # 127859061
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l # 22483132
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 62730883
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l # 61595170
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 46540254
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 46169395
perform genome alignment using AnchorWave and summarize the result
/usr/bin/time ./code/anchorwave proali -i Zea_mays.AGPv4.34.gff3 -as cds.fa -r Zea_mays.AGPv4.dna.toplevel.fa -a cds.sam -ar ref.sam -s Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa -n anchorwave.anchors -R 1 -Q 2 -o anchorwave.maf -f anchorwave.f.maf -w 38000 -fa3 200000 -B -4 -O1 -4 -E1 -2 -O2 -80 -E2 -1 -t 1 >anchorwave.log 2>&1
maf-convert sam anchorwave.maf | sed 's/Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa.//g' | sed 's/Zea_mays.AGPv4.dna.toplevel.fa.//g' | samtools view -O BAM --reference Zea_mays.AGPv4.dna.toplevel.fa - | samtools sort - > anchorwave.bam
samtools mpileup anchorwave.bam | wc -l # 1870770572
samtools depth anchorwave.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 57164808
samtools depth anchorwave.bam | awk '$3>0{print $0}' | wc -l # 125848664
samtools depth anchorwave.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l # 34133000
samtools depth anchorwave.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 150034551
samtools depth anchorwave.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l #68301524
samtools depth anchorwave.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 1308576282
samtools depth anchorwave.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 27551272
perform genome alignment using MUMmer4 vRC1 (https://mummer4.github.io/) and summarize the result
#/usr/bin/time /home/bs674/software/bin/nucmer -t 60 --sam-short=mumer.maize.short.sam Zea_mays.AGPv4.dna.toplevel.fa Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa > nucmer.log 2>&1
#/usr/bin/time /home/bs674/software/bin/nucmer -t 69 --sam-long=mumer.maize.long.sam Zea_mays.AGPv4.dna.toplevel.fa Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa > nucmer.long.log 2>&1
/usr/bin/time /home/bs674/software/bin/nucmer -t 1 --sam-short=mumer.maize.short.sam Zea_mays.AGPv4.dna.toplevel.fa Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa > nucmer.log 2>&1
cat mumer.maize.short.sam | grep -v "@" | samtools view -O BAM --reference Zea_mays.AGPv4.dna.toplevel.fa - | samtools sort - > mumer.maize.short.bam
samtools index mumer.maize.short.bam
samtools depth mumer.maize.short.bam | wc -l # 70953295
samtools depth mumer.maize.short.bam | awk '$3>0{print $0}' | wc -l #69514918
samtools depth mumer.maize.short.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 16073863
samtools depth mumer.maize.short.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l # 15687267
samtools depth mumer.maize.short.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 61402015
samtools depth mumer.maize.short.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l # 60194318
samtools depth mumer.maize.short.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 10169800
samtools depth mumer.maize.short.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 9987182
perform genome alignment using GSAlign(https://github.com/hsinnan75/GSAlign) and summarize the result
/usr/bin/time /home/bs674/software/GSAlign-1.0.22/bin/GSAlign -r Zea_mays.AGPv4.dna.toplevel.fa -q Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa -t 1 -o sorghum_gsalign -fmt 1 > GSAlign.log 2>&1
maf-convert sam sorghum_gsalign.maf | sed 's/qry.//g' | sed 's/ref.//g' | samtools view -O BAM --reference Zea_mays.AGPv4.dna.toplevel.fa - | samtools sort - > sorghum_gsalign.bam
samtools index sorghum_gsalign.bam
samtools depth sorghum_gsalign.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 5341494
samtools depth sorghum_gsalign.bam | wc -l # 23870692
samtools depth sorghum_gsalign.bam | awk '$3>0{print $0}' | wc -l # 23114133
samtools depth sorghum_gsalign.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l # 5138842
samtools depth sorghum_gsalign.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 21659744
samtools depth sorghum_gsalign.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l #21015569
samtools depth sorghum_gsalign.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 3322894
samtools depth sorghum_gsalign.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 3214544
dat = read.table("all_reproducible_peaks_summits_merged.bed")
dat$len = dat$V3-dat$V2
sum(abs(dat$len))
## [1] 60036753
#60036753
TE2018 = read.table("B73.structuralTEv2.disjoined.2018-09-19.bed")
TE2018$len = TE2018$V3-TE2018$V2
TE2018TotalLength = sum(abs(TE2018$len)) # 1495364259
library(ggplot2)
data = read.table("summaryData", header=TRUE)
data$recall = data$depth_covered_bed / data$bed
data$non_TFBS_matches = data$depth_covered - data$depth_covered_bed
data$non_TFBS = data$depth - data$depth_bed
data$enrichment = (data$depth_covered_bed/data$depth_bed)/(data$non_TFBS_matches/data$non_TFBS)
data$approach <- factor(data$approach, levels = c("AnchorWave", "minimap2_asm5", "minimap2_asm10", "minimap2_asm20", "LAST+many-to-many", "LAST+many-to-one", "LAST+one-to-one", "MUMmer4", "GSAlign"))
data$software <- factor(data$software, levels = c("AnchorWave", "minimap2", "LAST", "MUMmer4", "GSAlign"))
shape = c(16, 17, 17, 17, 15, 15, 15, 3, 7)
myColors <- c("#F8766D", "#D89000", "#A3A500", "#39B600", "#00BFC4", "#00B0F6", "#9590FF", "#E76BF3", "#FF62BC")
names(myColors) <- levels(data$approach)
colScale <- scale_colour_manual(name = "grp",values = myColors)
fillScale <- scale_fill_manual(name = "grp",values = myColors)
print(data)
## software approach depth depth_bed depth_covered
## 1 minimap2 minimap2_asm5 2503314 457652 2489881
## 2 minimap2 minimap2_asm10 18445054 3401133 17913090
## 3 minimap2 minimap2_asm20 52224881 11113820 49024035
## 4 MUMmer4 MUMmer4 70953295 16073863 69514918
## 5 GSAlign GSAlign 23870692 5341494 23114133
## 6 LAST LAST+many-to-many 597884081 33554729 594220046
## 7 LAST LAST+many-to-one 547513366 32593361 541455727
## 8 LAST LAST+one-to-one 129898533 23170062 127859061
## 9 AnchorWave AnchorWave 1870770572 57164808 125848664
## depth_covered_bed bed depth_genetic depth_genetic_covered X2018.09.19
## 1 456029 60036753 2332093 2320162 353993
## 2 3347285 60036753 17619682 17118348 2354836
## 3 10680401 60036753 47118588 44182530 7728976
## 4 15687267 60036753 61402015 60194318 10169800
## 5 5138842 60036753 21659744 21015569 3322894
## 6 32598984 60036753 118438260 116732768 365399859
## 7 31471791 60036753 113229899 110793737 337208689
## 8 22483132 60036753 62730883 61595170 46540254
## 9 34133000 60036753 150034551 68301524 1308576282
## X2018.09.19_covered recall non_TFBS_matches non_TFBS enrichment
## 1 348042 0.007595831 2033852 2045662 1.0022398
## 2 2259444 0.055753931 14565805 15043921 1.0164725
## 3 7132545 0.177897712 38343634 41111061 1.0303615
## 4 9987182 0.261294394 53827651 54879432 0.9950186
## 5 3214544 0.085594935 17975291 18529198 0.9917066
## 6 364395540 0.542983795 561621062 564329352 0.9762018
## 7 335012548 0.524208746 509983936 514920005 0.9749348
## 8 46169395 0.374489473 105375929 106728471 0.9828076
## 9 27551272 0.568535077 91715664 1813605764 11.8071501
p = ggplot(data=data, aes(x=recall, y=enrichment, color=approach, shape=software)) + geom_point(size=5, alpha=0.8) +
guides(colour = guide_legend(override.aes = list(shape = shape))) +scale_shape(guide = FALSE)+ colScale+fillScale+
labs(x="TFBS recall", y="Match ratio in TFBS\nalignment to match ratio\n in non-TFBS alignment", title="")+ xlim(0, 1) + ylim(0, 12) +
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "TFBS_quality_control"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png
## 2
Define the “recall” as (# position-match base pairs in TFBS and coding genetic region)/(total TFBS length + coding genetic region) By “coding genetic region”, I mean from TSS to TTS of coding genes (inlcuding introns, UTR, CDS) Define “precision” as (# position-match base pairs in TFBS and coding genetic region)/(# position-match base pairs in TFBS and coding genetic region + # position-match base pairs in TE region)
This is not a very solid analysis. But did not find much more convincing way to do that.
data$TFBS_coding_genes_recall = (data$depth_covered_bed+data$depth_genetic_covered) / (data$bed+161046032)
data$TFBS_coding_genes_precision = (data$depth_covered_bed+data$depth_genetic_covered)/(data$depth_covered_bed+data$depth_genetic_covered + data$X2018.09.19_covered)
data$TFBS_coding_genes_fscore = 2*(data$TFBS_coding_genes_precision*data$TFBS_coding_genes_recall)/(data$TFBS_coding_genes_precision+data$TFBS_coding_genes_recall)
p = ggplot(data=data, aes(x=approach, y=TFBS_coding_genes_fscore)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill=FALSE)+
labs(x="", y="F-score", title="TFBS+coding_genes") + colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
p = ggplot(data=data, aes(x=TFBS_coding_genes_recall, y=TFBS_coding_genes_precision, color=software, shape=software)) + geom_point(alpha=0.8) +
labs(x="recall", y="precision", title="TFBS+coding_genes") + colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
data$coding_genes_recall = (data$depth_genetic_covered) / (161046032)
data$coding_genes_precision = (data$depth_genetic_covered)/(data$depth_genetic_covered + data$X2018.09.19_covered)
data$coding_genes_fscore = 2*(data$coding_genes_precision*data$coding_genes_recall)/(data$coding_genes_precision+data$coding_genes_recall)
p = ggplot(data=data, aes(x=approach, y=coding_genes_fscore)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill=FALSE)+
labs(x="", y="F-score", title="coding_genes") + colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
p = ggplot(data=data, aes(x=coding_genes_recall, y=coding_genes_precision, color=software, shape=software)) + geom_point(alpha=0.8) +guides(color = FALSE, fill=FALSE)+
labs(x="recall", y="precision", title="coding_genes") + colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
data$TFBS_recall = (data$depth_covered_bed) / (data$bed)
data$TFBS_precision = (data$depth_covered_bed)/(data$depth_covered_bed + data$X2018.09.19_covered)
data$TFBS_fscore = 2*(data$TFBS_precision*data$TFBS_recall)/(data$TFBS_precision+data$TFBS_recall)
p = ggplot(data=data, aes(x=approach, y=TFBS_fscore)) + geom_bar(stat="identity", alpha=0.8)+
labs(x="", y="F-score", title="TFBS") + colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
p = ggplot(data=data, aes(x=TFBS_recall, y=TFBS_precision, color=software, shape=software)) + geom_point(alpha=0.8) +
labs(x="recall", y="precision", title="TFBS") + colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
p = ggplot(data=data, aes(x=approach, y=recall, color=approach, fill=approach)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill=FALSE)+
labs(x="", y="Proportion of maize\nTFBS matched to\nsorghum genome", title="") + colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
# axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "TFBS_recall"
png(paste(file, ".png", sep=""), width=720, height=320)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=4)
print(p)
dev.off()
## png
## 2
totalGenomeLength = read.table("Zea_mays.AGPv4.dna.toplevel.fa.fai")
totalGenomeLength = sum(totalGenomeLength$V2)
data$coverage = data$depth/totalGenomeLength
dat = rbind( data.frame(approach=data$approach, proportion=data$depth_covered/totalGenomeLength, category = "position match"), data.frame(approach=data$approach, proportion=(data$depth-data$depth_covered)/totalGenomeLength, category = "gap"), data.frame(approach=data$approach, proportion=(totalGenomeLength-data$depth)/totalGenomeLength, category = "unaligned") )
dat$category = factor(dat$category, levels=c("unaligned", "gap", "position match"))
p = ggplot(data=dat, aes(x=approach, y=proportion, fill=category)) + geom_bar(stat="identity") + scale_fill_manual(values=c("#54AEE1", "#92A000", "#EF8600")) + guides(fill=guide_legend(nrow=1,byrow=TRUE))+
labs(x="", y="Proportion of maize \n genome aligned to sorghum", title="")+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
legend.position = "top",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "genome_aligned"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png
## 2
data0 = data.frame(software=c("ideal expection", as.character(data$software)), approach=c("ideal expection", as.character(data$approach)), coverage=c(0, (data$X2018.09.19_covered)/data$depth_covered) )
data0$approach <- factor(data0$approach, levels = c("ideal expection", "AnchorWave", "minimap2_asm5", "minimap2_asm10", "minimap2_asm20","LAST+many-to-many", "LAST+many-to-one", "LAST+one-to-one", "MUMmer4", "GSAlign"))
data0$software <- factor(data0$software, levels = c("ideal expection", "AnchorWave", "minimap2", "LAST", "MUMmer4", "GSAlign"))
myColors <- c("#00FF00", "#F8766D", "#D89000", "#A3A500", "#39B600", "#00BFC4", "#00B0F6", "#9590FF", "#E76BF3", "#FF62BC")
names(myColors) <- levels(data$approach)
colScale <- scale_colour_manual(name = "grp",values = myColors)
fillScale <- scale_fill_manual(name = "grp",values = myColors)
p = ggplot(data=data0, aes(x=approach, y=coverage, color=approach, fill=approach)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill = FALSE)+
labs(x="", y="Proportion of maize-sorghum\ngenome alignmenet matchs\nin maize TE region", title="")+ylim(0, 1)+colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = c("#00FF00", "#000000", "#000000", "#000000", "#000000", "#000000", "#000000", "#000000", "#000000", "#000000", "#000000")))
print(p)
file = "2018.09.19_te_depth"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png
## 2
data$X2018.09.19_gaps = data$X2018.09.19 - data$X2018.09.19_covered
data$total_gaps = data$depth - data$depth_covered
data$non_TE_gaps = data$total_gaps - data$X2018.09.19_gaps
data$non_TE_matches = data$depth_covered - data$X2018.09.19_covered
data$non_TE = data$depth - data$X2018.09.19
data0 = data.frame(software=data$software, approach=data$approach, coverage=((data$X2018.09.19_gaps/data$X2018.09.19) / (data$non_TE_gaps/data$non_TE ) )) # the value of 162494287 is the total genetic region length, which is counted above in this file
p = ggplot(data=data0, aes(x=approach, y=coverage, color=approach, fill=approach)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill = FALSE)+ylim(0, 5)+geom_hline(yintercept=1, linetype="dashed", color = "gray", size=1) +
labs(x="", y="Gap ratio in TE\nalignment to gap ratio\nin non-TE alignment", title="")+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "2018.09.19_te_gap_ratio"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png
## 2
data0 = data.frame(software=data$software, approach=data$approach, coverage=( (data$non_TE_gaps/data$non_TE ) )) # the value of 162494287 is the total genetic region length, which is counted above in this file
p = ggplot(data=data0, aes(x=approach, y=coverage, color=approach, fill=approach)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill = FALSE)+ylim(0, 1.1)+geom_hline(yintercept=1, linetype="dashed", color = "gray", size=1) +
labs(x="", y="Gap ratio in non-TE", title="")+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
# axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
data0 = data.frame(software=data$software, approach=data$approach, coverage=( (data$X2018.09.19_gaps/data$X2018.09.19 ) )) # the value of 162494287 is the total genetic region length, which is counted above in this file
p = ggplot(data=data0, aes(x=approach, y=coverage, color=approach, fill=approach)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill = FALSE)+ylim(0, 1.1)+geom_hline(yintercept=1, linetype="dashed", color = "gray", size=1) +
labs(x="", y="Gap ratio in TE", title="")+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
# axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
data0 = data.frame(software=data$software, approach=data$approach, coverage=((data$X2018.09.19_covered/data$X2018.09.19) / (data$non_TE_matches/data$non_TE ) )) # the value of 162494287 is the total genetic region length, which is counted above in this file
p = ggplot(data=data0, aes(x=approach, y=coverage, color=approach, fill=approach)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill = FALSE)+ylim(0, 1.1)+geom_hline(yintercept=1, linetype="dashed", color = "gray", size=1) +
labs(x="", y="Match ratio in TE\nalignment to match ratio\n in non-TE alignment", title="")+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
# axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "2018.09.19_te_match_ratio"
png(paste(file, ".png", sep=""), width=720, height=320)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=4)
print(p)
dev.off()
## png
## 2
data0 = data.frame(software=data$software, approach=data$approach, ratio=((data$X2018.09.19-data$X2018.09.19_covered)/data$X2018.09.19), coverage=(data$X2018.09.19 - data$X2018.09.19_covered)/TE2018TotalLength )
p = ggplot(data=data0, aes(x=coverage, y=ratio, color=approach, shape=software)) + geom_point(size=5, alpha=0.8) +
guides(colour = guide_legend(override.aes = list(shape = shape))) +scale_shape(guide = FALSE)+
labs(x="Gap ratio in maize TEs(recall) of\nmaize to sorghum genome alignment", y="Gap ratio in maize TE alignments\n(precision)", title="")+xlim(0,1)+ylim(0,1)+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "2018.09.19_te_depth_2"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png
## 2
# here I am using the Cairo library to compile the output plot. The output file looks better than native library, but it a little bit of mass up the Rmarkdown output file.
library(ggplot2)
library(compiler)
enableJIT(3)
## [1] 3
library(ggplot2)
library("Cairo")
changetoM <- function ( position ){
position=position/1000000;
paste(position, "M", sep="")
}
data =read.table("anchorwave.anchors", head=TRUE)
data$refChr = paste("chr", data$refChr, sep="")
data$queryChr = paste("chr", data$queryChr, sep="")
data = data[which(data$refChr %in% c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10" )),]
data = data[which(data$queryChr %in% c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10" )),]
data$refChr = factor(data$refChr, levels=c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10" ))
data$queryChr = factor(data$queryChr, levels=c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10" ))
data$strand = factor(data$strand)
data = data[which(data$gene != "intergenetic"),]
p = ggplot(data=data, aes(x=queryStart, y=referenceStart))+geom_point(size=2, aes(color=factor(strand)))+facet_grid(refChr~queryChr, scales="free", space="free" )+ theme_grey(base_size = 60) +
labs(x="sorghum", y="maize")+scale_x_continuous(labels=changetoM) + scale_y_continuous(labels=changetoM) +
theme(axis.line = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(fill=NA,color="black", size=0.5, linetype="solid"),
axis.text.y = element_text( colour = "black"),
legend.position='none',
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black") )
CairoPNG(file="anchors.png",width = 4800, height = 3200)
p
dev.off()
## png
## 2
CairoPDF(file="anchors.pdf",width = 60, height = 40)
p
dev.off()
## png
## 2
data =read.table("anchorwave.anchors", head=TRUE)
data$refChr = paste("chr", data$refChr, sep="")
data$queryChr = paste("chr", data$queryChr, sep="")
data = data[which(data$refChr %in% c("chr4", "chr5")),]
data = data[which(data$queryChr %in% c("chr4", "chr5" )),]
data$refChr = factor(data$refChr, levels=c( "chr4", "chr5" ))
data$queryChr = factor(data$queryChr, levels=c( "chr4", "chr5" ))
data$strand = factor(data$strand)
data = data[which(data$gene != "intergenetic"),]
p = ggplot(data=data, aes(x=queryStart, y=referenceStart))+geom_point(size=2, aes(color=factor(strand)))+facet_grid(refChr~queryChr, scales="free", space="free" )+ theme_grey(base_size = 24) +
labs(x="sorghum", y="maize")+scale_x_continuous(labels=changetoM) + scale_y_continuous(labels=changetoM) +
theme(axis.line = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(fill=NA,color="black", size=0.5, linetype="solid"),
axis.text.y = element_text( colour = "black"),
legend.position='none',
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black") )
CairoPNG(file="anchor2.png",width = 720, height = 560)
p
dev.off()
## png
## 2
CairoPDF(file="anchor2.pdf",width = 9, height = 7)
p
dev.off()
## png
## 2
# this plot suggested there is no Interchromosomal translocations happened